Emile Cohen
July 2020
Goal: In this notebook, we want to understand what makes Endometrial Cancer a textbook case for the patterns we saw, and what are the major subcohorts that drive the signal.
%run -i '../../../../../utils/setup_environment.ipy'
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import fisher_exact, ranksums, chi2, norm
from statsmodels.sandbox.stats.multicomp import multipletests
import matplotlib.gridspec as gridspec
import pickle
data_path = '../../../../../data/'
data_wgd = data_path + 'impact-facets-tp53/processed/wgd/'
data_no_wgd = data_path + 'impact-facets-tp53/processed/no_wgd/'
# Export to HTML
!jupyter nbconvert --to html endometrial_cancer.ipynb
from functools import reduce
def get_hotspots(df: pd.DataFrame, Sample_Type: str, group: list = None, group_type:str = None):
    """Count TP53 hotspot occurrences over the five ``tp53_spot_*`` columns.

    Parameters
    ----------
    df : table holding ``Sample_Type`` and ``tp53_spot_1`` .. ``tp53_spot_5``.
    Sample_Type : value of the ``Sample_Type`` column to keep.
    group, group_type : optional filter; when both are given only rows whose
        ``group_type`` column value is in ``group`` are kept.

    Returns
    -------
    DataFrame indexed by hotspot with columns ``count_1`` .. ``count_5`` and
    ``total``, sorted by ``total`` in descending order, without the ``'nan'``
    row when one is present.
    """
    data = df[df['Sample_Type'] == Sample_Type]
    if group and group_type:
        data = data[data[group_type].isin(group)]

    # Give every per-column count table its own column name *before* merging:
    # merging several frames that all expose a 'count' column makes pandas
    # generate clashing '_x'/'_y' suffixes from the third merge onwards.
    series_data = []
    for rank in range(1, 6):
        counts = get_groupby(data, 'tp53_spot_{}'.format(rank), 'count')
        counts.columns = ['count_{}'.format(rank)]
        series_data.append(counts)

    df_merged = reduce(
        lambda left, right: pd.merge(left, right, left_index=True, right_index=True, how='outer'),
        series_data,
    ).fillna(0)
    df_merged.columns = ['count_1', 'count_2', 'count_3', 'count_4', 'count_5']
    df_merged['total'] = df_merged.sum(axis=1)
    df_merged = df_merged.sort_values(by='total', ascending=False)
    # The 'nan' row is only there when some samples lack a hotspot value;
    # do not fail when every sample has one.
    df_merged = df_merged.drop('nan', errors='ignore')
    return df_merged
def get_hotspot_frac(df: pd.DataFrame, group_type:str = None, group: list = None, nb = 10):
    """Summarise the most frequent ``tp53_spot_1`` values.

    When both ``group_type`` and ``group`` are given, ``df`` is first limited
    to rows whose ``group_type`` value is in ``group``. For each of the ``nb``
    most frequent spots the result holds the spot, the number of matching rows
    and the median ``frac_genome_altered``. The first row of the returned
    DataFrame is the header ``['spot', '#', 'frac']``.
    """
    if group_type and group:
        df = df[df[group_type].isin(group)]

    spot_counts = get_groupby(df, 'tp53_spot_1', 'count')
    top_spots = spot_counts.sort_values(by='count', ascending=False).head(nb).index.tolist()

    rows = [['spot', '#', 'frac']]
    for spot in top_spots:
        fga = df[df['tp53_spot_1'] == spot].frac_genome_altered
        rows.append([spot, fga.shape[0], fga.median()])
    return pd.DataFrame(rows)
def boxplot_sampletype(df: pd.DataFrame, group:str, palette, order, metrics: str, figsize= (10,3), title: str = '', title_font: int=12, xlim=[0,1]):
    """Draw a boxplot of ``metrics`` per ``group`` category with sample counts in the tick labels.

    Parameters
    ----------
    df : table holding the ``group`` and ``metrics`` columns.
    group : column used for the x axis categories.
    palette, order : forwarded to ``sns.boxplot``; ``order`` also drives the tick labels.
    metrics : column plotted on the y axis.
    figsize, title, title_font : figure size, title text and title font size.
    xlim : limits applied to the **y** axis (the name is historical); never mutated here.

    Returns
    -------
    (fig, ax) : the created figure and axes.
    """
    fig = plt.figure(figsize=figsize)
    ax = plt.subplot2grid(shape=(2, 1), loc=(0, 0), colspan=1)
    sns.boxplot(y=metrics, x=group, data=df, ax=ax, dodge=False, order=order, palette=palette).set_title(title, weight='bold', fontsize=title_font)

    groupby_ = get_groupby(df, group, 'count')
    labels = []
    for element in order:
        # A category listed in `order` may have no sample in `df` (e.g. after
        # sub-cohort filtering); label it with 0 instead of raising KeyError.
        if element in groupby_.index:
            count = groupby_.loc[element].values[0]
        else:
            count = 0
        labels.append('{}\n({})'.format(element, count))
    ax.set_xticklabels(labels)

    style(ax)
    ax.set_ylim(xlim)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    return fig, ax
# Let's give a look at medians and statistics
def get_statistics(df: pd.DataFrame, group:str, metrics: str, group_list: list):
    """Compare ``metrics`` between the first two categories of ``group_list``.

    Returns a 5x3 DataFrame: a header row, one row per category with its size
    (all rows, missing values included) and median, then the Wilcoxon rank-sum
    statistic and p-value computed on the non-missing values.
    """
    first_label, second_label = group_list[0], group_list[1]
    first_values = df.loc[df[group] == first_label, metrics]
    second_values = df.loc[df[group] == second_label, metrics]

    statistic, p_value = ranksums(first_values.dropna().values,
                                  second_values.dropna().values)

    table = [
        ['', 'size', metrics],
        [first_label, len(first_values), first_values.median()],
        [second_label, len(second_values), second_values.median()],
        ['', 'Statistics', 'p-value'],
        ['', statistic, p_value],
    ]
    return pd.DataFrame(table)
def get_major_codrivers(master: pd.DataFrame, maf: pd.DataFrame, head:int = 10, tp53=False):
    """Return the most frequently mutated driver genes among the samples of ``master``.

    Parameters
    ----------
    master : table whose ``Tumor_Id`` column lists the samples to keep.
    maf : mutation table with ``Tumor_Sample_Barcode``, ``driver`` and ``Hugo_Symbol``.
    head : number of genes to return.
    tp53 : when False (default) TP53 mutations are excluded from the counts.

    Returns
    -------
    DataFrame indexed by ``Hugo_Symbol`` with a single ``count`` column,
    sorted by ``count`` in descending order and truncated to ``head`` rows.
    """
    samples = master.Tumor_Id.tolist()
    # Build one mask on the full frame instead of chaining `maf[a][b][c]`:
    # the chained form applies masks computed on the unfiltered frame to an
    # already-filtered one and relies on pandas silently re-aligning them.
    keep = maf.Tumor_Sample_Barcode.isin(samples) & (maf['driver'] == True)
    if not tp53:
        keep = keep & (maf['Hugo_Symbol'] != 'TP53')
    maf_filtered = maf[keep]

    h = maf_filtered.groupby('Hugo_Symbol').size().to_frame('count')
    h = h.sort_values(by='count', ascending=False).head(head)
    return h
def create_co_drivers_table(master: pd.DataFrame, group_type:str, group_1: str, group_2: str):
    """Build a side-by-side co-driver proportion table for two sub-groups of ``master``.

    For each of ``group_1`` and ``group_2`` (values of the ``group_type``
    column) the top-100 co-driver genes are counted in the module-level
    ``maf_cohort_nowgd`` table and converted to a percentage of that group's
    counted mutations. The two tables are merged on ``Hugo_Symbol`` (genes
    present in both groups only) and ``proportion_1`` is negated so the two
    groups can be drawn on opposite sides of a horizontal bar plot.
    """
    def _with_proportion(group_value, column):
        # Count the co-drivers of one sub-group and add its percentage column.
        table = get_major_codrivers(master=master[master[group_type] == group_value],
                                    maf=maf_cohort_nowgd,
                                    head=100)
        # Compute the total once (not once per row) and stay vectorised so an
        # empty group yields an empty column instead of failing.
        total = table['count'].sum()
        table[column] = 100 * (table['count'] / total).round(4)
        return table

    co_drivers_group_1 = _with_proportion(group_1, 'proportion_1')
    co_drivers_group_2 = _with_proportion(group_2, 'proportion_2')

    co_drivers_groups = pd.merge(co_drivers_group_1, co_drivers_group_2, on='Hugo_Symbol')
    co_drivers_groups['proportion_1'] = - co_drivers_groups['proportion_1']
    return co_drivers_groups
# Cancer type studied in this notebook; used to filter the master tables and in plot titles.
cancer = 'Endometrial Cancer'
# Master tables: the non-WGD one goes through the project helper non_wgd_load_and_cut
# (not defined in this file), the WGD one is read directly from its pickle.
master_no_wgd = non_wgd_load_and_cut(data_path + 'impact-facets-tp53/processed/no_wgd/master_no_wgd.pkl')
master_wgd = pd.read_pickle(data_path + 'impact-facets-tp53/processed/wgd/master_wgd.pkl')
# Keep only the rows of the selected cancer type.
master_no_wgd_cancer = master_no_wgd[master_no_wgd['Cancer_Type'] == cancer]
master_wgd_cancer = master_wgd[master_wgd['Cancer_Type'] == cancer]
# Mutation (MAF) tables for both cohorts; the 'Unnamed: 0' column is dropped after reading.
maf_cohort_nowgd = pd.read_csv(data_path + 'impact-facets-tp53/processed/no_wgd/maf_cohort_nowgd.txt', sep='\t').drop('Unnamed: 0', axis=1)
maf_cohort_wgd = pd.read_csv(data_path + 'impact-facets-tp53/processed/wgd/maf_cohort_wgd.txt', sep='\t').drop('Unnamed: 0', axis=1)
Breast Cancer is the biggest cancer in our cohort. Breast Cancer has a slightly low proportion of WGD - around 30%


Endometrial Cancer shows a significant difference in Genome Instability between TP53 Mono-Allelic and Bi-Allelic subgroups - and has a lot of samples in both groups.

In the TP53 subgroup Pan Cancer plot that follows, we can see 2 important signals:

The following cells show the proportions of the different group levels: Primary samples on the right, Metastatic samples on the left.


In WGD cohort, Genome Instability median is above 70% for all cancer types.
We still see differences between the TP53 bi-allelic and mono-allelic states, but they are not very significant:

In this section, our goal is to find the subcohorts that drive the observed signals. Here are the different subcohorts we will create:
In this section, we cut our cohort to only keep samples with exactly one TP53 mutation, for simplicity.
# Restrict the non-WGD cancer cohort to samples with exactly one TP53 mutation.
master_hotspot = master_no_wgd_cancer[master_no_wgd_cancer['tp53_count'] == 1]
# Most frequent first-mutation hotspots with their sample count and median frac_genome_altered.
get_hotspot_frac(df=master_hotspot,
group_type=None,
group=None)
# Sample count per 'tp53_vc_group_1' value, largest group first.
h = get_groupby(master_hotspot,'tp53_vc_group_1', 'count').sort_values(by='count', ascending=False)
display(h)
# Transpose so each mutation group becomes a column, then order the columns by mutation_list
# (mutation_list is not defined in this file - presumably provided by setup_environment.ipy).
h = h.T
h = h[mutation_list]
fig = plt.figure(figsize=(6,1))
ax = plt.subplot()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# Single stacked horizontal bar showing the mutation-type composition.
h_plot = h.plot(kind = 'barh', stacked=True, yticks=[], ax=ax, colormap="Accent")
# NOTE(review): legend labels are hard-coded; they presumably follow the order of mutation_list - confirm.
ax.legend(['In Frame', 'Truncated', 'Missense', 'Hotspot 248','Hotspot 273','Hotspot 175', 'Other Hotspot'],loc='center left', bbox_to_anchor=(1.1, 0.5), fontsize=11)
ax.set_title('Mutation Type - {} - No WGD'.format(cancer), weight='bold', fontsize=18)
plt.show()
# Fraction of genome altered per mutation type for the single-mutation cohort.
fig, ax = boxplot_sampletype(df=master_hotspot,
group='tp53_vc_group_1',
palette=mutation_palette,
order=mutation_list,
metrics='frac_genome_altered',
figsize=(6,10),
title='Fraction of Genome Altered - {}'.format(cancer),
xlim=[0,1])
plt.show()
# Sizes of the two 'tp53_res_group' sub-cohorts among single-mutation samples.
print('Number of Bi Allelic samples (with 1 mut): ' + str(master_hotspot[master_hotspot['tp53_res_group'] == 'no_tp53_res'].shape[0]))
print('')
print('Number of TP53 Residual samples (with 1 mut): ' + str(master_hotspot[master_hotspot['tp53_res_group'] == 'tp53_res'].shape[0]))
# Per-group mutation-type count tables, displayed after the loop.
total_df = []
# Only 'no_tp53_res' is iterated, so the `group == 'tp53_res'` legend branch below never runs here.
for group in ['no_tp53_res']:
# Count samples per mutation type within the group; the count column is named after the group.
h = get_groupby(master_hotspot[master_hotspot['tp53_res_group'] == group], 'tp53_vc_group_1', group).sort_values(by=group, ascending=False)
total_df.append(h)
# Transpose and order the columns by mutation_list before drawing the stacked bar.
h = h.T
h = h[mutation_list]
fig = plt.figure(figsize=(6,1))
ax = plt.subplot()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
h_plot = h.plot(kind = 'barh', stacked=True, yticks=[], ax=ax, colormap="Accent")
if group == 'tp53_res':
ax.legend(['In Frame', 'Truncated', 'Missense', 'Hotspot 248','Hotspot 273','Hotspot 175', 'Other Hotspot'],loc='center left', bbox_to_anchor=(1.05, 0.5), fontsize=11)
else: ax.get_legend().remove()
ax.set_title('Mutation Type - {} - No WGD'.format(group), weight='bold', fontsize=18)
plt.show()
display_side_by_side(total_df[0])
# Same single-group loop: boxplot of frac_genome_altered per mutation type within the group.
for group in ['no_tp53_res']:
master_wt = master_hotspot[master_hotspot['tp53_res_group'] == group]
fig, ax = boxplot_sampletype(df=master_wt,
group='tp53_vc_group_1',
palette=mutation_palette,
order=mutation_list,
metrics='frac_genome_altered',
figsize=(6,10),
title='Fraction of Genome Altered - No WGD - {} subgroup'.format(group),
xlim=[0,1])
plt.show()
In this section we compare SNV and INDEL mutations. As in the previous section, we cut the cohort to keep only samples with exactly 1 tp53 mutation.
# Three parallel sections follow (all drivers, SNV drivers, INDEL drivers). Each draws a boxplot of the
# metric per 'tp53_group' and then shows rank-sum comparisons for three fixed pairs of groups.
# `palette` and `group_list` are not defined in this file - presumably provided by setup_environment.ipy.

# --- All driver mutations ---
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer,
group='tp53_group',
palette=palette,
order=group_list,
metrics='driver_mutation_count',
figsize=(8,12),
title='Driver Mutation Count - TP53 Subroups - No WGD',
xlim=[-0.1,50])
plt.show()
display_side_by_side(get_statistics(df=master_no_wgd_cancer,
group='tp53_group',
metrics='driver_mutation_count',
group_list=['1_WILD_TYPE', '0_HETLOSS']),
get_statistics(df=master_no_wgd_cancer,
group='tp53_group',
metrics='driver_mutation_count',
group_list=['1_WILD_TYPE', '>=1_LOSS']),
get_statistics(df=master_no_wgd_cancer,
group='tp53_group',
metrics='driver_mutation_count',
group_list=['>1muts', '>=1_LOSS']))
# --- SNV driver mutations ---
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer,
group='tp53_group',
palette=palette,
order=group_list,
metrics='snv_driver_mutation_count',
figsize=(8,12),
title='SNV Driver Mutation Count - TP53 Subroups - No WGD',
xlim=[-0.1,15])
plt.show()
display_side_by_side(get_statistics(df=master_no_wgd_cancer,
group='tp53_group',
metrics='snv_driver_mutation_count',
group_list=['1_WILD_TYPE', '0_HETLOSS']),
get_statistics(df=master_no_wgd_cancer,
group='tp53_group',
metrics='snv_driver_mutation_count',
group_list=['1_WILD_TYPE', '>=1_LOSS']),
get_statistics(df=master_no_wgd_cancer,
group='tp53_group',
metrics='snv_driver_mutation_count',
group_list=['>1muts', '>=1_LOSS']))
# --- INDEL driver mutations ---
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer,
group='tp53_group',
palette=palette,
order=group_list,
metrics='indel_driver_mutation_count',
figsize=(8,12),
title='INDEL Driver Mutation Count - TP53 Subroups - No WGD',
xlim=[-0.1,35])
plt.show()
display_side_by_side(get_statistics(df=master_no_wgd_cancer,
group='tp53_group',
metrics='indel_driver_mutation_count',
group_list=['1_WILD_TYPE', '0_HETLOSS']),
get_statistics(df=master_no_wgd_cancer,
group='tp53_group',
metrics='indel_driver_mutation_count',
group_list=['1_WILD_TYPE', '>=1_LOSS']),
get_statistics(df=master_no_wgd_cancer,
group='tp53_group',
metrics='indel_driver_mutation_count',
group_list=['>1muts', '>=1_LOSS']))
The idea here is to see if we have differences in Fraction of Genome Altered if we cut our Cancer cohort on the number of drivers per sample.
Do we have more instability with more INDEL Driver Mutations within the same subgroup?
# Keep the '1_WILD_TYPE' tp53_group samples of the non-WGD cancer cohort.
master_no_wgd_cancer_wt = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '1_WILD_TYPE']
# Threshold on driver_mutation_count; read as a global by get_driver_groups below.
thr=6
# Row classifier: 'High' when driver_mutation_count > thr, 'Low' when <= thr.
# Returns None implicitly when neither comparison holds (e.g. a missing count).
def get_driver_groups(x):
if x.driver_mutation_count > thr:
return 'High Co-Driver Count'
if x.driver_mutation_count <= thr:
return 'Low Co-Driver Count'
master_no_wgd_cancer_wt['co_driver_group'] = master_no_wgd_cancer_wt.apply(get_driver_groups, axis=1)
# Compare frac_genome_altered between the two co-driver groups.
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer_wt,
group='co_driver_group',
palette={'High Co-Driver Count': '#FF9900' , 'Low Co-Driver Count': '#146EB4'},
order=['High Co-Driver Count', 'Low Co-Driver Count'],
metrics='frac_genome_altered',
figsize=(4,10),
title='Fraction of Genome Altered - 1_WILD_TYPE subgroup - Co Driver Count (thr={}) - {}'.format(thr,cancer),
xlim=[0,1])
plt.show()
get_statistics(df=master_no_wgd_cancer_wt,
group='co_driver_group',
metrics='frac_genome_altered',
group_list=['High Co-Driver Count', 'Low Co-Driver Count'])
def plot_density(df: pd.DataFrame, xlabel='', ylabel='', title='',clip = (0,3), group = None, figsize=(5,5)):
    """Plot the kernel density estimate of ``df['tp53_ccf_1']`` with mean and median markers.

    Parameters
    ----------
    df : table holding the ``tp53_ccf_1`` column.
    xlabel, ylabel : axis labels.
    title : suffix appended to the fixed title prefix; the number of rows of
        ``df`` is appended in parentheses.
    clip : range passed to the KDE through ``kde_kws``.
    group : unused; kept so existing calls remain valid.
    figsize : figure size.

    The figure is shown with ``plt.show()``; nothing is returned. Note that
    ``sns.set_style`` changes the seaborn style for subsequent plots as well.
    """
    sns.set_style("whitegrid", {'grid.color': '1.'})
    fig, ax = plt.subplots(figsize=figsize)
    data = df['tp53_ccf_1']
    ax = sns.distplot(data, kde_kws={'clip': clip, "shade": True}, hist=False)
    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)
    ax.set_title('TP53 CCF for 1_WT in Non WGD Samples' + title + ' (' + str(len(data)) + ')', weight = 'bold')

    mean = round(data.mean(), 2)
    median = round(data.median(), 2)
    # Vertical markers double as the legend entries carrying the numeric values.
    ax.axvline(mean, color='g', linestyle='-', label='Mean: '+ str(mean))
    ax.axvline(median, color='b', linestyle='-', label='Median: ' + str(median))
    ax.legend()
    plt.show()
# Split the wild-type cohort by the co_driver_group column and plot the TP53 CCF density of each half.
master_high_count = master_no_wgd_cancer_wt[master_no_wgd_cancer_wt['co_driver_group'] == 'High Co-Driver Count']
master_low_count = master_no_wgd_cancer_wt[master_no_wgd_cancer_wt['co_driver_group'] == 'Low Co-Driver Count']
plot_density(df=master_high_count,
xlabel='TP53 CCF',
ylabel='density estimation',
title=' - High Co-Driver Count',
clip = (0,1),
group = None, figsize=(7,3))
plt.show()
plot_density(df=master_low_count,
xlabel='TP53 CCF',
ylabel='density estimation',
title=' - Low Co-Driver Count',
clip = (0,1),
group = None, figsize=(7,3))
plt.show()
So we see that samples with fewer co-drivers have higher Genome Instability.
# Keep the '0_HETLOSS' tp53_group samples of the non-WGD cancer cohort.
master_no_wgd_cancer_het = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '0_HETLOSS']
# Threshold on indel_driver_mutation_count; with thr=0 the split is "at least one INDEL driver" vs none.
thr=0
# Redefines get_driver_groups to classify on indel_driver_mutation_count instead of driver_mutation_count.
# Returns None implicitly when neither comparison holds (e.g. a missing count).
def get_driver_groups(x):
if x.indel_driver_mutation_count > thr:
return 'High Co-Driver Count'
if x.indel_driver_mutation_count <= thr:
return 'Low Co-Driver Count'
master_no_wgd_cancer_het['co_driver_group'] = master_no_wgd_cancer_het.apply(get_driver_groups, axis=1)
# Compare frac_genome_altered between the two INDEL co-driver groups.
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer_het,
group='co_driver_group',
palette={'High Co-Driver Count': '#FF9900' , 'Low Co-Driver Count': '#146EB4'},
order=['High Co-Driver Count', 'Low Co-Driver Count'],
metrics='frac_genome_altered',
figsize=(4,10),
title='Fraction of Genome Altered - 0_HETLOSS subgroup - INDEL Co Driver Count (thr={}) - {}'.format(thr,cancer),
xlim=[0,1])
plt.show()
get_statistics(df=master_no_wgd_cancer_het,
group='co_driver_group',
metrics='frac_genome_altered',
group_list=['High Co-Driver Count', 'Low Co-Driver Count'])
# CCF threshold separating 'low' (<= threshold) from 'high' (> threshold) first-mutation CCF.
thr_ccf_1 = 0.9
# Row classifier on tp53_ccf_1; returns None implicitly when neither comparison holds (e.g. missing CCF).
def ccf_subgroup(x):
if x.tp53_ccf_1 <= thr_ccf_1: return 'low'
elif x.tp53_ccf_1 > thr_ccf_1: return 'high'
master_no_wgd_cancer['ccf_group'] = master_no_wgd_cancer.apply(ccf_subgroup, axis=1)
# Sub-cohort sizes, overall and per tp53_group.
get_groupby(master_no_wgd_cancer, 'ccf_group', 'count')
pd.DataFrame(master_no_wgd_cancer[['ccf_group', 'tp53_group']].groupby([ 'tp53_group', 'ccf_group']).size())
# Low-CCF cohort: low-CCF samples plus every sample without a TP53 mutation (tp53_count == 0).
master_no_wgd_cancer_low = master_no_wgd_cancer[(master_no_wgd_cancer['ccf_group'] == 'low') | (master_no_wgd_cancer['tp53_count'] == 0)]
get_groupby(master_no_wgd_cancer_low, 'tp53_group', 'count')
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer_low,
group='tp53_group',
palette=palette,
order=group_list,
metrics='indel_driver_mutation_count',
figsize=(8,12),
title='INDEL Driver Mutation Count - TP53 Subroups - No WGD - Low TP53 CCF',
xlim=[-0.1,30])
plt.show()
# Rank-sum comparisons of the INDEL driver count for three fixed pairs of tp53_group values.
display_side_by_side(get_statistics(df=master_no_wgd_cancer_low,
group='tp53_group',
metrics='indel_driver_mutation_count',
group_list=['1_WILD_TYPE', '0_HETLOSS']),
get_statistics(df=master_no_wgd_cancer_low,
group='tp53_group',
metrics='indel_driver_mutation_count',
group_list=['1_WILD_TYPE', '>=1_LOSS']),
get_statistics(df=master_no_wgd_cancer_low,
group='tp53_group',
metrics='indel_driver_mutation_count',
group_list=['>1muts', '>=1_LOSS']))
# '1_WILD_TYPE' samples of the low-CCF cohort.
master_no_wgd_cancer_low_wt = master_no_wgd_cancer_low[master_no_wgd_cancer_low['tp53_group'] == '1_WILD_TYPE']
# Threshold on driver_mutation_count; read as a global by get_driver_groups below.
thr=6
# Redefines get_driver_groups to classify on driver_mutation_count again.
# Returns None implicitly when neither comparison holds (e.g. a missing count).
def get_driver_groups(x):
if x.driver_mutation_count > thr:
return 'High Co-Driver Count'
if x.driver_mutation_count <= thr:
return 'Low Co-Driver Count'
master_no_wgd_cancer_low_wt['co_driver_group'] = master_no_wgd_cancer_low_wt.apply(get_driver_groups, axis=1)
# Compare frac_genome_altered between the two co-driver groups within the low-CCF wild-type samples.
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer_low_wt,
group='co_driver_group',
palette={'High Co-Driver Count': '#FF9900' , 'Low Co-Driver Count': '#146EB4'},
order=['High Co-Driver Count', 'Low Co-Driver Count'],
metrics='frac_genome_altered',
figsize=(4,10),
title='Fraction of Genome Altered - 1_WILD_TYPE subgroup - Low TP53 CCF - Co Driver Count (thr={}) - {}'.format(thr,cancer),
xlim=[0,1])
plt.show()
get_statistics(df=master_no_wgd_cancer_low_wt,
group='co_driver_group',
metrics='frac_genome_altered',
group_list=['High Co-Driver Count', 'Low Co-Driver Count'])
# High-CCF cohort: high-CCF samples plus every sample without a TP53 mutation (tp53_count == 0).
master_no_wgd_cancer_high = master_no_wgd_cancer[(master_no_wgd_cancer['ccf_group'] == 'high') | (master_no_wgd_cancer['tp53_count'] == 0)]
get_groupby(master_no_wgd_cancer_high, 'tp53_group', 'count')
# NOTE(review): this boxplot shows 'driver_mutation_count' while the statistics below are computed on
# 'indel_driver_mutation_count' (the low-CCF counterpart uses the INDEL count for both) - confirm which is intended.
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer_high,
group='tp53_group',
palette=palette,
order=group_list,
metrics='driver_mutation_count',
figsize=(8,12),
title='Driver Mutation Count - TP53 Subroups - No WGD - High TP53 CCF',
xlim=[-0.1,40])
plt.show()
# Rank-sum comparisons of the INDEL driver count for three fixed pairs of tp53_group values.
display_side_by_side(get_statistics(df=master_no_wgd_cancer_high,
group='tp53_group',
metrics='indel_driver_mutation_count',
group_list=['1_WILD_TYPE', '0_HETLOSS']),
get_statistics(df=master_no_wgd_cancer_high,
group='tp53_group',
metrics='indel_driver_mutation_count',
group_list=['1_WILD_TYPE', '>=1_LOSS']),
get_statistics(df=master_no_wgd_cancer_high,
group='tp53_group',
metrics='indel_driver_mutation_count',
group_list=['>1muts', '>=1_LOSS']))
# '1_WILD_TYPE' samples of the high-CCF cohort.
master_no_wgd_cancer_high_wt = master_no_wgd_cancer_high[master_no_wgd_cancer_high['tp53_group'] == '1_WILD_TYPE']
# Threshold on driver_mutation_count; read as a global by get_driver_groups below.
thr=6
# Row classifier: 'High' when driver_mutation_count > thr, 'Low' when <= thr.
# Returns None implicitly when neither comparison holds (e.g. a missing count).
def get_driver_groups(x):
if x.driver_mutation_count > thr:
return 'High Co-Driver Count'
if x.driver_mutation_count <= thr:
return 'Low Co-Driver Count'
master_no_wgd_cancer_high_wt['co_driver_group'] = master_no_wgd_cancer_high_wt.apply(get_driver_groups, axis=1)
# Compare frac_genome_altered between the two co-driver groups within the high-CCF wild-type samples.
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer_high_wt,
group='co_driver_group',
palette={'High Co-Driver Count': '#FF9900' , 'Low Co-Driver Count': '#146EB4'},
order=['High Co-Driver Count', 'Low Co-Driver Count'],
metrics='frac_genome_altered',
figsize=(4,10),
title='Fraction of Genome Altered - 1_WILD_TYPE subgroup - High TP53 CCF - Co Driver Count (thr={}) - {}'.format(thr,cancer),
xlim=[0,1])
plt.show()
get_statistics(df=master_no_wgd_cancer_high_wt,
group='co_driver_group',
metrics='frac_genome_altered',
group_list=['High Co-Driver Count', 'Low Co-Driver Count'])
# Hotspot summary tables for each co-driver group.
get_hotspot_frac(df=master_no_wgd_cancer_high_wt[master_no_wgd_cancer_high_wt['co_driver_group'] == 'High Co-Driver Count'],
group_type=None,
group=None)
get_hotspot_frac(df=master_no_wgd_cancer_high_wt[master_no_wgd_cancer_high_wt['co_driver_group'] == 'Low Co-Driver Count'],
group_type=None,
group=None)
# Same split, but comparing Patient_Current_Age (the `xlim` argument sets the y-axis range to 20-100).
fig, ax = boxplot_sampletype(df=master_no_wgd_cancer_high_wt,
group='co_driver_group',
palette={'High Co-Driver Count': '#FF9900' , 'Low Co-Driver Count': '#146EB4'},
order=['High Co-Driver Count', 'Low Co-Driver Count'],
metrics='Patient_Current_Age',
figsize=(4,10),
title='Patient Age - 1_WILD_TYPE subgroup - High TP53 CCF - Co Driver Count (thr={}) - {}'.format(thr,cancer),
xlim=[20,100])
plt.show()
# Kaplan-Meier overall-survival curves for the two co-driver groups of the
# high-CCF wild-type cohort, drawn on a shared axis.
fig = plt.figure(figsize=(10,7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis', fontsize=16, weight='bold')
kmf = KaplanMeierFitter()
# Pair each group with its curve colour directly instead of indexing a colour
# list through range(len(...)).
for group, color in zip(['High Co-Driver Count', 'Low Co-Driver Count'], ['#FF9900', '#146EB4']):
    # Keep only samples with both a survival time and a survival status.
    data = master_no_wgd_cancer_high_wt[master_no_wgd_cancer_high_wt['co_driver_group'] == group].dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    # Event indicator: 1 when the status is 'DECEASED', 0 otherwise. The
    # vectorised comparison also yields a valid (empty) column for an empty group.
    data['Overall Survival Status 0/1'] = (data['Overall_Survival_Status'] == 'DECEASED').astype(int)
    kmf.fit(np.array(data['Overall_Survival_Months']), event_observed=np.array(data['Overall Survival Status 0/1']), label= group)
    kmf.plot_survival_function(color=color, ax=ax)
plt.show()
# Top-15 co-drivers (TP53 excluded) in the whole non-WGD cancer cohort ...
codrivers_cancer = get_major_codrivers(master=master_no_wgd_cancer,
maf=maf_cohort_nowgd,
head=15)
# ... and in the samples carrying at least one TP53 mutation.
codrivers_cancer_tp53 = get_major_codrivers(master=master_no_wgd_cancer[master_no_wgd_cancer['tp53_count'] >= 1],
maf=maf_cohort_nowgd,
head=15)
# Inner merge: only genes present in both top-15 lists are kept.
co_drivers = pd.merge(codrivers_cancer, codrivers_cancer_tp53, on='Hugo_Symbol')
co_drivers.columns = ['cancer', 'cancer_tp53']
# Percentage of each gene's cohort-wide mutation count that falls in TP53-mutated samples.
co_drivers['ratio'] = co_drivers.apply(lambda x: 100*round(x.cancer_tp53/x.cancer, 4) , axis=1)
co_drivers = co_drivers.sort_values(by='ratio', ascending=False)
co_drivers
# Tick labels: gene name followed by its cohort-wide count.
labels = []
for element in co_drivers.index.tolist():
labels.append(element + ' ('+ str(int(co_drivers.loc[element]['cancer']))+')')
ax = sns.barplot(y=co_drivers.index, x='ratio',data=co_drivers[['ratio']], color='#7F8C8D', saturation=.2)
ax.set_yticklabels(labels)
ax.set_title('Co-Drivers Enrichment in TP53 State')
codrivers_cancer
# Driver frequency including TP53 (tp53=True): share of each gene among the top-15 driver counts.
labels = []
codrivers_cancer = get_major_codrivers(master=master_no_wgd_cancer,
maf=maf_cohort_nowgd,
head=15,
tp53=True)
codrivers_cancer['proportion'] = codrivers_cancer.apply(lambda x: 100* round(x['count'] / codrivers_cancer.sum().values[0], 4), axis=1)
# Only the ten most frequent genes are plotted.
for element in codrivers_cancer.head(10).index.tolist():
labels.append(element + ' ('+ str(int(codrivers_cancer.loc[element]['count']))+')')
ax = sns.barplot(y=codrivers_cancer.head(10).index, x='proportion',data=codrivers_cancer.head(10)[['proportion']], color='#7F8C8D', saturation=.2)
ax.set_yticklabels(labels)
ax.set_title('Drivers Frequency in {}'.format(cancer))
# --- TP53 residual vs no TP53 residual ---
co_drivers_res = create_co_drivers_table(master=master_no_wgd_cancer,
group_type='tp53_res_group',
group_1='tp53_res',
group_2='no_tp53_res')
co_drivers_res
fig=plt.figure(figsize=(7,7))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)
# Back-to-back bars: proportion_1 is negative (left side), proportion_2 positive (right side); top 10 genes.
co_drivers_res[['proportion_1', 'proportion_2']].head(10)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = ['#2ECC71','#1E8449'])
ax.legend(['TP53 Residual', 'No TP53 Residual'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=14)
plt.yticks(fontsize=10)
ax.set_ylabel('')
# The tick positions read here are immediately overwritten by hard-coded absolute-value labels.
# NOTE(review): the hard-coded list presumably has to match the number of ticks matplotlib chose - confirm.
a=ax.get_xticks().tolist()
a = [25, 20, 15, 10, 5, 0, 5, 10, 15, 20, 25]
ax.set_xticklabels(a, fontsize=10)
# NOTE(review): the `b` keyword of plt.grid was renamed in newer matplotlib releases - confirm against the installed version.
plt.grid(b=None)
plt.show()
# --- >=1_cnLOH vs >=1_LOSS ---
co_drivers_cnloh_loss = create_co_drivers_table(master=master_no_wgd_cancer,
group_type='tp53_group',
group_1='>=1_cnLOH',
group_2='>=1_LOSS')
co_drivers_cnloh_loss
fig=plt.figure(figsize=(7,7))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)
# `mc` is not defined in this file - presumably a colour list provided by setup_environment.ipy.
co_drivers_cnloh_loss[['proportion_1', 'proportion_2']].head(10)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = [mc[4],mc[0]])
ax.legend(['>=1_cnLOH', '>=1_LOSS'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=14)
plt.yticks(fontsize=10)
ax.set_ylabel('')
# Same pattern as above: the read tick positions are replaced by hard-coded labels.
a=ax.get_xticks().tolist()
a = [20, 15, 10, 5, 0, 5, 10, 15, 20, 25]
ax.set_xticklabels(a, fontsize=10)
plt.grid(b=None)
plt.show()
Proportions are the same in both groups.
# --- 0_HETLOSS vs >=1_LOSS ---
co_drivers_losses = create_co_drivers_table(master=master_no_wgd_cancer,
group_type='tp53_group',
group_1='0_HETLOSS',
group_2='>=1_LOSS')
co_drivers_losses
fig=plt.figure(figsize=(7,7))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)
# Back-to-back bars for the ten first genes of the merged table (proportion_1 is negative).
co_drivers_losses[['proportion_1', 'proportion_2']].head(10)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = [mc[5],mc[0]])
ax.legend(['0_HETLOSS', '>=1_LOSS'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=14)
plt.yticks(fontsize=10)
ax.set_ylabel('')
# Tick relabelling is disabled here, so the left-hand side keeps its negative tick values.
a=ax.get_xticks().tolist()
#a = [-40, -30, -20, -10, 0, 10, 20, 30, 40]
#ax.set_xticklabels(a, fontsize=10)
plt.grid(b=None)
plt.show()
Same proportions, enrichment in APC and KRAS
# --- >1muts vs >=1_cnLOH ---
co_drivers_mult_cnloh = create_co_drivers_table(master=master_no_wgd_cancer,
group_type='tp53_group',
group_1='>1muts',
group_2='>=1_cnLOH')
co_drivers_mult_cnloh
# Raw co-driver counts of the '>1muts' group alone, for inspection.
get_major_codrivers(master=master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '>1muts'],
maf=maf_cohort_nowgd,
head=100)
fig=plt.figure(figsize=(10,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)
# Back-to-back bars for the twenty first genes of the merged table (proportion_1 is negative).
co_drivers_mult_cnloh[['proportion_1', 'proportion_2']].head(20)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = [mc[3],mc[4]])
ax.legend(['>1muts', '>=1_cnLOH'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=15)
plt.yticks(fontsize=10)
ax.set_ylabel('')
# The tick positions read here are immediately overwritten by a hard-coded label list
# (unlike the earlier plots, this one keeps the negative sign on the left-hand side).
a=ax.get_xticks().tolist()
a = [-20, -10, 0, 10, 20, 30, 40]
ax.set_xticklabels(a, fontsize=10)
plt.grid(b=None)
plt.show()
# --- 1_WILD_TYPE vs >=1_LOSS ---
co_drivers_wt_loss = create_co_drivers_table(master=master_no_wgd_cancer,
group_type='tp53_group',
group_1='1_WILD_TYPE',
group_2='>=1_LOSS')
co_drivers_wt_loss
fig=plt.figure(figsize=(10,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)
co_drivers_wt_loss[['proportion_1', 'proportion_2']].head(20)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = [mc[2],mc[0]])
ax.legend(['1_WILD_TYPE', '>=1_LOSS'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=15)
plt.yticks(fontsize=10)
ax.set_ylabel('')
# Tick relabelling is disabled here.
a=ax.get_xticks().tolist()
#a = [-10, 0, 10, 20, 30, 40]
#ax.set_xticklabels(a, fontsize=10)
plt.grid(b=None)
plt.show()
def get_major_codrivers(master: pd.DataFrame, maf: pd.DataFrame, head:int = 10, tp53=False):
    """Return the most frequently mutated driver genes among the samples of ``master``.

    This redefinition keeps the optional ``tp53`` keyword of the earlier
    definition in this notebook, so cells calling it with ``tp53=True`` still
    work after this cell has been executed. With the default ``tp53=False``
    TP53 mutations are excluded from the counts.

    Parameters
    ----------
    master : table whose ``Tumor_Id`` column lists the samples to keep.
    maf : mutation table with ``Tumor_Sample_Barcode``, ``driver`` and ``Hugo_Symbol``.
    head : number of genes to return.
    tp53 : when True, TP53 mutations are counted as well.

    Returns
    -------
    DataFrame indexed by ``Hugo_Symbol`` with a single ``count`` column,
    sorted by ``count`` in descending order and truncated to ``head`` rows.
    """
    samples = master.Tumor_Id.tolist()
    # One combined mask on the full frame replaces the chained `maf[a][b][c]`
    # form, which applies masks built on the unfiltered frame to a filtered one.
    keep = maf.Tumor_Sample_Barcode.isin(samples) & (maf['driver'] == True)
    if not tp53:
        keep = keep & (maf['Hugo_Symbol'] != 'TP53')
    maf_filtered = maf[keep]

    h = maf_filtered.groupby('Hugo_Symbol').size().to_frame('count')
    h = h.sort_values(by='count', ascending=False).head(head)
    return h
def create_co_drivers_table_wgd(master_1: pd.DataFrame, master_2: pd.DataFrame, group_type:str, group_1: str):
    """Build a co-driver proportion table comparing a non-WGD sub-group with WGD TP53-LOH samples.

    Group 1 is the rows of ``master_1`` whose ``group_type`` column equals
    ``group_1``; its co-drivers are counted in the module-level
    ``maf_cohort_nowgd``. Group 2 is the rows of ``master_2`` with
    ``tp53_count >= 1`` and ``tp53_loh_status == True``; its co-drivers are
    counted in the module-level ``maf_cohort_wgd``. Each count is converted to
    a percentage of its group's counted mutations, the tables are merged on
    ``Hugo_Symbol`` (genes present in both only) and ``proportion_2`` is
    negated so the groups can be drawn on opposite sides of a bar plot.
    """
    def _with_proportion(master, maf, column):
        # Count the top-100 co-drivers of one group and add its percentage column.
        table = get_major_codrivers(master=master, maf=maf, head=100)
        # Total computed once rather than once per row; vectorised so an empty
        # group yields an empty column instead of failing.
        total = table['count'].sum()
        table[column] = 100 * (table['count'] / total).round(4)
        return table

    master_group_1 = master_1[master_1[group_type] == group_1]
    co_drivers_group_1 = _with_proportion(master_group_1, maf_cohort_nowgd, 'proportion_1')

    # Single combined mask instead of chained boolean indexing.
    wgd_mask = (master_2['tp53_count'] >= 1) & (master_2['tp53_loh_status'] == True)
    master_group_2 = master_2[wgd_mask]
    co_drivers_group_2 = _with_proportion(master_group_2, maf_cohort_wgd, 'proportion_2')

    co_drivers_groups = pd.merge(co_drivers_group_2, co_drivers_group_1, on='Hugo_Symbol')
    co_drivers_groups['proportion_2'] = - co_drivers_groups['proportion_2']
    return co_drivers_groups
# --- WGD samples with TP53 mutation and LOH vs non-WGD >=1_LOSS ---
co_drivers_wgd_loss = create_co_drivers_table_wgd(master_1=master_no_wgd_cancer,
master_2=master_wgd_cancer,
group_type='tp53_group',
group_1='>=1_LOSS')
co_drivers_wgd_loss
fig=plt.figure(figsize=(8,8))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['top'].set_visible(False)
# Here proportion_2 (WGD) is the negated column, so it is plotted first, on the left-hand side.
co_drivers_wgd_loss[['proportion_2', 'proportion_1']].head(15)[::-1].plot.barh(stacked=True, ax=ax, width=1, color = ['#7F8C8D',mc[0]])
ax.legend(['WGD - TP53 - LOH', '>=1_LOSS'], fontsize=10)
ax.set_title('Co-Drivers Proportion per TP53 State', fontsize=14)
plt.yticks(fontsize=10)
ax.set_ylabel('')
# Tick relabelling is disabled here.
a=ax.get_xticks().tolist()
#a = [-50, -40, -30, -20, -10, 0, 10, 20, 30, 40, 50]
#ax.set_xticklabels(a, fontsize=10)
plt.grid(b=None)
plt.show()
def get_master_codrivers(master: pd.DataFrame, maf: pd.DataFrame, symbol: str):
    """Return the rows of ``master`` whose sample carries a mutation in gene ``symbol``.

    Parameters
    ----------
    master : table with a ``Tumor_Id`` column.
    maf : mutation table with ``Tumor_Sample_Barcode`` and ``Hugo_Symbol``.
    symbol : gene symbol to look for (any mutation, driver or not).

    Returns
    -------
    An independent copy of the matching ``master`` rows, so callers can add
    columns to it without writing into a slice of ``master``.
    """
    samples = master.Tumor_Id.tolist()
    # One combined mask instead of chained boolean indexing.
    has_symbol = maf.Tumor_Sample_Barcode.isin(samples) & (maf['Hugo_Symbol'] == symbol)
    samples_final = maf.loc[has_symbol, 'Tumor_Sample_Barcode'].tolist()
    master_filtered = master[master.Tumor_Id.isin(samples_final)].copy()
    return master_filtered
# '>=1_cnLOH' samples and, within them, the samples mutated in PIK3CA or RB1.
master_no_wgd_cancer_cnloh = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '>=1_cnLOH']
master_PIK3CA = get_master_codrivers(master=master_no_wgd_cancer_cnloh,
maf=maf_cohort_nowgd,
symbol='PIK3CA')
master_RB1 = get_master_codrivers(master=master_no_wgd_cancer_cnloh,
maf=maf_cohort_nowgd,
symbol='RB1')
# Tag each table so the concatenation can be grouped by 'data' in the boxplot.
master_no_wgd_cancer_cnloh['data'] = '>=1_cnLOH'
master_RB1['data'] = 'RB1'
master_PIK3CA['data'] = 'PIK3CA'
# The whole group is concatenated with its gene subsets, so a sample mutated in a listed gene
# appears both under the group label and under the gene label.
masters = [master_no_wgd_cancer_cnloh, master_RB1, master_PIK3CA]
allMasters = pd.concat(masters)
fig=plt.figure(figsize=(5,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
allMasters[['frac_genome_altered', 'data']].boxplot(by="data", ax=ax)
ax.set_title('Fraction of Genome Altered - >=1_cnLOH')
ax.set_xlabel('')
# '>=1_LOSS' samples and, within them, the samples mutated in CDH1, GATA3 or PIK3CA.
master_no_wgd_cancer_loss = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '>=1_LOSS']
master_CDH1 = get_master_codrivers(master=master_no_wgd_cancer_loss,
maf=maf_cohort_nowgd,
symbol='CDH1')
master_GATA3 = get_master_codrivers(master=master_no_wgd_cancer_loss,
maf=maf_cohort_nowgd,
symbol='GATA3')
master_PIK3CA = get_master_codrivers(master=master_no_wgd_cancer_loss,
maf=maf_cohort_nowgd,
symbol='PIK3CA')
# Tag each table so the concatenation can be grouped by 'data' in the boxplot.
master_no_wgd_cancer_loss['data'] = '>=1_loss'
master_CDH1['data'] = 'CDH1'
master_GATA3['data'] = 'GATA3'
master_PIK3CA['data'] = 'PIK3CA'
# The whole group is concatenated with its gene subsets, so a sample mutated in a listed gene
# appears both under the group label and under the gene label.
masters = [master_no_wgd_cancer_loss, master_CDH1, master_GATA3, master_PIK3CA]
allMasters = pd.concat(masters)
fig=plt.figure(figsize=(5,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
allMasters[['frac_genome_altered', 'data']].boxplot(by="data", ax=ax)
ax.set_title('Fraction of Genome Altered - >=1_LOSS')
ax.set_xlabel('')
# '0_HETLOSS' samples and, within them, the samples mutated in selected genes.
# The variable keeps the name used by the previous cell although it now holds
# the 0_HETLOSS group.
master_no_wgd_cancer_loss = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '0_HETLOSS']
# NOTE(review): the APC subset is computed but never added to the plot below -
# confirm whether it was meant to be included.
master_ = get_master_codrivers(master=master_no_wgd_cancer_loss,
                               maf=maf_cohort_nowgd,
                               symbol='APC')
master_CDH1 = get_master_codrivers(master=master_no_wgd_cancer_loss,
                                   maf=maf_cohort_nowgd,
                                   symbol='CDH1')
master_GATA3 = get_master_codrivers(master=master_no_wgd_cancer_loss,
                                    maf=maf_cohort_nowgd,
                                    symbol='GATA3')
master_PIK3CA = get_master_codrivers(master=master_no_wgd_cancer_loss,
                                     maf=maf_cohort_nowgd,
                                     symbol='PIK3CA')
# Tag each table so the concatenation can be grouped by 'data' in the boxplot.
# The whole-group label now matches the group actually selected above (it was
# '>=1_loss', left over from the previous cell).
master_no_wgd_cancer_loss['data'] = '0_HETLOSS'
master_CDH1['data'] = 'CDH1'
master_GATA3['data'] = 'GATA3'
master_PIK3CA['data'] = 'PIK3CA'
# The whole group is concatenated with its gene subsets, so a sample mutated in
# a listed gene appears both under the group label and under the gene label.
masters = [master_no_wgd_cancer_loss, master_CDH1, master_GATA3, master_PIK3CA]
allMasters = pd.concat(masters)
fig=plt.figure(figsize=(5,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
allMasters[['frac_genome_altered', 'data']].boxplot(by="data", ax=ax)
ax.set_title('Fraction of Genome Altered - 0_HETLOSS')
ax.set_xlabel('')
# '1_WILD_TYPE' samples and, within them, the samples mutated in PIK3CA or MAP3K1.
# This rebinds master_no_wgd_cancer_wt to a fresh selection from master_no_wgd_cancer.
master_no_wgd_cancer_wt = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == '1_WILD_TYPE']
master_PIK3CA = get_master_codrivers(master=master_no_wgd_cancer_wt,
maf=maf_cohort_nowgd,
symbol='PIK3CA')
master_MAP3K1 = get_master_codrivers(master=master_no_wgd_cancer_wt,
maf=maf_cohort_nowgd,
symbol='MAP3K1')
# Tag each table so the concatenation can be grouped by 'data' in the boxplot.
master_no_wgd_cancer_wt['data'] = '1_WT'
master_PIK3CA['data'] = 'PIK3CA'
master_MAP3K1['data'] = 'MAP3K1'
# The whole group is concatenated with its gene subsets, so a sample mutated in a listed gene
# appears both under the group label and under the gene label.
masters = [master_no_wgd_cancer_wt, master_PIK3CA, master_MAP3K1]
allMasters = pd.concat(masters)
fig=plt.figure(figsize=(5,10))
ax = plt.subplot2grid(shape=(2,1), loc=(0,0), colspan=1)
allMasters[['frac_genome_altered', 'data']].boxplot(by="data", ax=ax)
ax.set_title('Fraction of Genome Altered - 1_WT')
ax.set_xlabel('')
Here again, we take only samples with exactly 1 tp53 mutation (master_hotspot).
We have to define groups for CCF to see if there are differences between those groups. To have an idea of the CCF distribution we show here the distribution coming from the cancer_panel.

We see that our tp53_ccf distribution is very high for all subgroups. >=1_LOSS and 0_HETLOSS are the biggest subgroups - by far - and >=1_LOSS* has a very high CCF median.
It will be hard to cut the cohort based on the CCF. Let's try and see the size of the subcohorts:
master_ccf = master_no_wgd_cancer[(master_no_wgd_cancer['tp53_count'] == 1) | (master_no_wgd_cancer['tp53_group'] == '0_HETLOSS')]
# CCF cut-offs: values <= thr_ccf_1 are 'low', values in (thr_ccf_1, thr_ccf_2]
# are 'medium', anything above thr_ccf_2 is 'high'.
thr_ccf_1 = 0.9
thr_ccf_2 = 0.95


def ccf_subgroup(x):
    """Return the CCF tier of one sample row.

    Args:
        x: any object exposing a ``tp53_ccf_1`` attribute (for example a
            DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns:
        'low', 'medium' or 'high' according to the module-level thresholds
        ``thr_ccf_1`` and ``thr_ccf_2`` (looked up at call time), or ``None``
        when the CCF is missing.
    """
    ccf = x.tp53_ccf_1
    # A missing CCF gets no tier. NaN used to fall through every comparison
    # and return None implicitly; None / pd.NA used to raise a TypeError.
    if pd.isna(ccf):
        return None
    if ccf <= thr_ccf_1:
        return 'low'
    if ccf <= thr_ccf_2:
        return 'medium'
    return 'high'
# Tag every row of master_ccf with the tier returned by ccf_subgroup (row-wise
# apply), then tabulate the tiers with get_groupby.
master_ccf['ccf_group'] = master_ccf.apply(ccf_subgroup, axis='columns')
get_groupby(master_ccf, 'ccf_group', 'count')
# VAF cut-offs: values <= thr_vaf_1 are 'low', values in (thr_vaf_1, thr_vaf_2]
# are 'medium', anything above thr_vaf_2 is 'high'.
thr_vaf_1 = 0.3
thr_vaf_2 = 0.4


def vaf_subgroup(x):
    """Return the VAF tier of one sample row.

    Args:
        x: any object exposing a ``tp53_vaf_1`` attribute (for example a
            DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns:
        'low', 'medium' or 'high' according to the module-level thresholds
        ``thr_vaf_1`` and ``thr_vaf_2`` (looked up at call time), or ``None``
        when the VAF is missing.
    """
    vaf = x.tp53_vaf_1
    # A missing VAF gets no tier. NaN used to fall through every comparison
    # and return None implicitly; None / pd.NA used to raise a TypeError.
    if pd.isna(vaf):
        return None
    if vaf <= thr_vaf_1:
        return 'low'
    if vaf <= thr_vaf_2:
        return 'medium'
    return 'high'
# Tag every row of master_ccf with the tier returned by vaf_subgroup (row-wise
# apply), then tabulate the tiers with get_groupby.
master_ccf['vaf_group'] = master_ccf.apply(vaf_subgroup, axis='columns')
get_groupby(master_ccf, 'vaf_group', 'count')
# Reference plot for the VAF section: frac_genome_altered per tp53_group on the
# whole master_ccf cohort, followed by get_statistics on the 0_HETLOSS and
# 1_WILD_TYPE groups.
fig, ax = boxplot_sampletype(
    df=master_ccf,
    metrics='frac_genome_altered',
    group='tp53_group',
    order=['1_WILD_TYPE', '0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
    palette=palette,
    title=f'Fraction of Genome Altered - {cancer}',
    figsize=(5, 10),
    xlim=[0, 1],
)
plt.show()
get_statistics(
    df=master_ccf,
    metrics='frac_genome_altered',
    group='tp53_group',
    group_list=['0_HETLOSS', '1_WILD_TYPE'],
)
# Low-VAF view: rows whose vaf_group is 'low', plus every 0_HETLOSS row.
master_low = master_ccf[
    master_ccf['vaf_group'].eq('low') | master_ccf['tp53_group'].eq('0_HETLOSS')
]
fig, ax = boxplot_sampletype(
    df=master_low,
    metrics='frac_genome_altered',
    group='tp53_group',
    order=['1_WILD_TYPE', '0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
    palette=palette,
    title=f'Fraction of Genome Altered - VAF < {thr_vaf_1} - {cancer}',
    figsize=(5, 10),
    xlim=[0, 1],
)
plt.show()
# get_statistics is run on the 1_WILD_TYPE and >=1_LOSS groups here.
get_statistics(
    df=master_low,
    metrics='frac_genome_altered',
    group='tp53_group',
    group_list=['1_WILD_TYPE', '>=1_LOSS'],
)
# Medium-VAF view: rows whose vaf_group is 'medium', plus every 0_HETLOSS row.
master_med = master_ccf[
    master_ccf['vaf_group'].eq('medium') | master_ccf['tp53_group'].eq('0_HETLOSS')
]
fig, ax = boxplot_sampletype(
    df=master_med,
    metrics='frac_genome_altered',
    group='tp53_group',
    order=['1_WILD_TYPE', '0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
    palette=palette,
    title=f'Fraction of Genome Altered - {thr_vaf_1} < VAF < {thr_vaf_2} - {cancer}',
    figsize=(5, 10),
    xlim=[0, 1],
)
plt.show()
# get_statistics is run on the 1_WILD_TYPE and 0_HETLOSS groups.
get_statistics(
    df=master_med,
    metrics='frac_genome_altered',
    group='tp53_group',
    group_list=['1_WILD_TYPE', '0_HETLOSS'],
)
# High-VAF view: rows whose vaf_group is 'high', plus every 0_HETLOSS row.
master_high = master_ccf[
    master_ccf['vaf_group'].eq('high') | master_ccf['tp53_group'].eq('0_HETLOSS')
]
fig, ax = boxplot_sampletype(
    df=master_high,
    metrics='frac_genome_altered',
    group='tp53_group',
    order=['1_WILD_TYPE', '0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
    palette=palette,
    title=f'Fraction of Genome Altered - VAF > {thr_vaf_2} - {cancer}',
    figsize=(5, 10),
    xlim=[0, 1],
)
plt.show()
# get_statistics is run on the 1_WILD_TYPE and 0_HETLOSS groups.
get_statistics(
    df=master_high,
    metrics='frac_genome_altered',
    group='tp53_group',
    group_list=['1_WILD_TYPE', '0_HETLOSS'],
)
# Reference plot for the CCF section: frac_genome_altered per tp53_group on the
# whole master_ccf cohort, followed by get_statistics on the 0_HETLOSS and
# 1_WILD_TYPE groups.
fig, ax = boxplot_sampletype(
    df=master_ccf,
    metrics='frac_genome_altered',
    group='tp53_group',
    order=['1_WILD_TYPE', '0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
    palette=palette,
    title=f'Fraction of Genome Altered - {cancer}',
    figsize=(5, 10),
    xlim=[0, 1],
)
plt.show()
get_statistics(
    df=master_ccf,
    metrics='frac_genome_altered',
    group='tp53_group',
    group_list=['0_HETLOSS', '1_WILD_TYPE'],
)
# Low-CCF view: rows whose ccf_group is 'low', plus every 0_HETLOSS row.
master_low = master_ccf[
    master_ccf['ccf_group'].eq('low') | master_ccf['tp53_group'].eq('0_HETLOSS')
]
fig, ax = boxplot_sampletype(
    df=master_low,
    metrics='frac_genome_altered',
    group='tp53_group',
    order=['1_WILD_TYPE', '0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
    palette=palette,
    title=f'Fraction of Genome Altered - CCF < {thr_ccf_1} - {cancer}',
    figsize=(5, 10),
    xlim=[0, 1],
)
plt.show()
# get_statistics is run on the 1_WILD_TYPE and 0_HETLOSS groups.
get_statistics(
    df=master_low,
    metrics='frac_genome_altered',
    group='tp53_group',
    group_list=['1_WILD_TYPE', '0_HETLOSS'],
)
# Medium-CCF view: rows whose ccf_group is 'medium', plus every 0_HETLOSS row.
master_med = master_ccf[
    master_ccf['ccf_group'].eq('medium') | master_ccf['tp53_group'].eq('0_HETLOSS')
]
fig, ax = boxplot_sampletype(
    df=master_med,
    metrics='frac_genome_altered',
    group='tp53_group',
    order=['1_WILD_TYPE', '0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
    palette=palette,
    title=f'Fraction of Genome Altered - {thr_ccf_1} < CCF < {thr_ccf_2} - {cancer}',
    figsize=(5, 10),
    xlim=[0, 1],
)
plt.show()
# get_statistics is run on the 1_WILD_TYPE and 0_HETLOSS groups.
get_statistics(
    df=master_med,
    metrics='frac_genome_altered',
    group='tp53_group',
    group_list=['1_WILD_TYPE', '0_HETLOSS'],
)
# High-CCF view: rows whose ccf_group is 'high', plus every 0_HETLOSS row.
master_high = master_ccf[
    master_ccf['ccf_group'].eq('high') | master_ccf['tp53_group'].eq('0_HETLOSS')
]
fig, ax = boxplot_sampletype(
    df=master_high,
    metrics='frac_genome_altered',
    group='tp53_group',
    order=['1_WILD_TYPE', '0_HETLOSS', '>=1_LOSS', '>=1_cnLOH'],
    palette=palette,
    title=f'Fraction of Genome Altered - CCF > {thr_ccf_2} - {cancer}',
    figsize=(5, 10),
    xlim=[0, 1],
)
plt.show()
# get_statistics is run on the 1_WILD_TYPE and 0_HETLOSS groups.
get_statistics(
    df=master_high,
    metrics='frac_genome_altered',
    group='tp53_group',
    group_list=['1_WILD_TYPE', '0_HETLOSS'],
)
# frac_genome_altered per VAF tier on master_hotspot; the three tiers take the
# first three tab10 colours. get_statistics is then run on 'low' and 'medium'.
fig, ax = boxplot_sampletype(
    df=master_hotspot,
    metrics='frac_genome_altered',
    group='vaf_group',
    order=['low', 'medium', 'high'],
    palette={
        'low': tab10[0],
        'medium': tab10[1],
        'high': tab10[2],
    },
    title=f'Fraction of Genome Altered - VAF levels - {cancer}',
    figsize=(3, 10),
    xlim=[0, 1],
)
plt.show()
get_statistics(
    df=master_hotspot,
    metrics='frac_genome_altered',
    group='vaf_group',
    group_list=['low', 'medium'],
)
# frac_genome_altered per CCF tier on master_hotspot; the three tiers take the
# first three tab10 colours. get_statistics is then run on 'low' and 'medium'.
fig, ax = boxplot_sampletype(
    df=master_hotspot,
    metrics='frac_genome_altered',
    group='ccf_group',
    order=['low', 'medium', 'high'],
    palette={
        'low': tab10[0],
        'medium': tab10[1],
        'high': tab10[2],
    },
    title=f'Fraction of Genome Altered - CCF levels - {cancer}',
    figsize=(3, 10),
    xlim=[0, 1],
)
plt.show()
get_statistics(
    df=master_hotspot,
    metrics='frac_genome_altered',
    group='ccf_group',
    group_list=['low', 'medium'],
)
# Horizontal box plot of Patient_Current_Age over master_no_wgd_cancer.
# No figure is opened here (a plt.figure(figsize=(10,3)) call was left
# disabled), so the grid cell is created on whichever figure is current.
ax = plt.subplot2grid(shape=(4, 1), loc=(0, 0), colspan=1)
sns.boxplot(data=master_no_wgd_cancer, x='Patient_Current_Age', ax=ax)
ax.set_title(f'Patient Age - {cancer}', weight='bold', fontsize=14)
style(ax)
# Hide the right and top borders of the axes.
for spine_side in ('right', 'top'):
    ax.spines[spine_side].set_visible(False)
# Patient_Current_Age per tp53_res_group on master_no_wgd_cancer, followed by
# get_statistics on the tp53_res and no_tp53_res groups.
fig, ax = boxplot_sampletype(
    df=master_no_wgd_cancer,
    metrics='Patient_Current_Age',
    group='tp53_res_group',
    order=res_group_list,
    palette=palette_res,
    title=f'Patient Current Age - {cancer}',
    figsize=(3, 10),
    xlim=[20, 100],
)
plt.show()
get_statistics(
    df=master_no_wgd_cancer,
    metrics='Patient_Current_Age',
    group='tp53_res_group',
    group_list=['tp53_res', 'no_tp53_res'],
)
# Patient_Current_Age per tp53_group on master_no_wgd_cancer, followed by
# get_statistics on the 1_WILD_TYPE and >=1_cnLOH groups.
fig, ax = boxplot_sampletype(
    df=master_no_wgd_cancer,
    metrics='Patient_Current_Age',
    group='tp53_group',
    order=group_list,
    palette=palette,
    title=f'Patient Current Age - {cancer}',
    figsize=(7, 10),
    xlim=[20, 100],
)
plt.show()
get_statistics(
    df=master_no_wgd_cancer,
    metrics='Patient_Current_Age',
    group='tp53_group',
    group_list=['1_WILD_TYPE', '>=1_cnLOH'],
)
# Sex distribution of master_no_wgd_cancer drawn as one stacked horizontal bar.

# Counts per Sex value, most frequent first (shown as a table as well).
h = get_groupby(master_no_wgd_cancer, 'Sex', 'count').sort_values(by='count', ascending=False)
display(h)

# Transpose to a single row with one column per sex, in a fixed Male/Female
# order. reindex is used instead of h[['Male', 'Female']] so that a sex that
# is absent from the cohort becomes a zero-width segment rather than a
# KeyError (an endometrial cohort is expected to contain no 'Male' rows).
h = h.T.reindex(columns=['Male', 'Female'], fill_value=0)

fig = plt.figure(figsize=(6, 1))
ax = plt.subplot()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
h_plot = h.plot(kind='barh', stacked=True, yticks=[], ax=ax)
# Legend is placed outside the axes, to the right of the bar.
ax.legend(['Male', 'Female'], loc='center left', bbox_to_anchor=(1.1, 0.5), fontsize=11)
ax.set_title('Sex Distribution - {} - No WGD'.format(cancer), weight='bold', fontsize=18)
plt.show()
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test
# Log-rank test of overall survival between the 'tp53_res' and 'no_tp53_res'
# values of tp53_res_group in master_no_wgd_cancer.

# Keep only rows where both survival columns are present.
data = master_no_wgd_cancer.dropna(subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
# Event flag: 1 when the status is 'DECEASED', 0 otherwise. The comparison is
# vectorised; the former row-wise apply returns a DataFrame on an empty frame,
# which cannot be assigned to a single column.
data['Overall Survival Status 0/1'] = (data['Overall_Survival_Status'] == 'DECEASED').astype(int)
data = data[['tp53_group', 'tp53_res_group', 'Overall Survival Status 0/1', 'Overall_Survival_Months']]

# Row masks for the two groups being compared.
ix1 = data['tp53_res_group'] == 'tp53_res'
ix2 = data['tp53_res_group'] == 'no_tp53_res'

# Durations (T) and event flags (E) for each group.
T_exp, E_exp = data.loc[ix1, 'Overall_Survival_Months'], data.loc[ix1, 'Overall Survival Status 0/1']
T_con, E_con = data.loc[ix2, 'Overall_Survival_Months'], data.loc[ix2, 'Overall Survival Status 0/1']

results = logrank_test(T_exp, T_con, event_observed_A=E_exp, event_observed_B=E_con)
results.print_summary()
# Kaplan-Meier overall-survival curves for the first two entries of
# res_group_list, computed on master_no_wgd_cancer and drawn on one axes.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - Non-WGD Cohort - {}'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()
for i, group in enumerate(res_group_list[:2]):
    # Rows of this group that have both survival columns filled in.
    data = master_no_wgd_cancer[master_no_wgd_cancer['tp53_res_group'] == group].dropna(
        subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    if data.empty:
        # Nothing to fit for this group; say so instead of failing mid-plot.
        print('No sample with survival data in group {!r} - skipped'.format(group))
        continue
    # Event flag: 1 when the status is 'DECEASED', 0 otherwise.
    data['Overall Survival Status 0/1'] = (data['Overall_Survival_Status'] == 'DECEASED').astype(int)
    kmf.fit(np.array(data['Overall_Survival_Months']),
            event_observed=np.array(data['Overall Survival Status 0/1']),
            label=group)
    # The i-th colour of res_palette_list goes with the i-th group.
    kmf.plot_survival_function(color=res_palette_list[i], ax=ax)
plt.show()
# Kaplan-Meier overall-survival curves on master_wgd_cancer for the 'bi' and
# 'tp53_res' values of prewgd_tp53_group_1, drawn on one axes.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - WGD Cohort - {}'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()
for i, group in enumerate(['bi', 'tp53_res']):
    # Rows of this group that have both survival columns filled in.
    data = master_wgd_cancer[master_wgd_cancer['prewgd_tp53_group_1'] == group].dropna(
        subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    if data.empty:
        # Nothing to fit for this group; say so instead of failing mid-plot.
        print('No sample with survival data in group {!r} - skipped'.format(group))
        continue
    # Event flag: 1 when the status is 'DECEASED', 0 otherwise.
    data['Overall Survival Status 0/1'] = (data['Overall_Survival_Status'] == 'DECEASED').astype(int)
    kmf.fit(np.array(data['Overall_Survival_Months']),
            event_observed=np.array(data['Overall Survival Status 0/1']),
            label=group)
    # The i-th colour of res_palette_list goes with the i-th group.
    kmf.plot_survival_function(color=res_palette_list[i], ax=ax)
plt.show()
from lifelines import KaplanMeierFitter
# Kaplan-Meier overall-survival curves for every entry of group_list
# (tp53_group values), computed on master_no_wgd_cancer and drawn on one axes.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - Non-WGD Cohort - {}'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()
for i, group in enumerate(group_list):
    # Rows of this group that have both survival columns filled in.
    data = master_no_wgd_cancer[master_no_wgd_cancer['tp53_group'] == group].dropna(
        subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    if data.empty:
        # Nothing to fit for this group; say so instead of failing mid-plot.
        print('No sample with survival data in group {!r} - skipped'.format(group))
        continue
    # Event flag: 1 when the status is 'DECEASED', 0 otherwise.
    data['Overall Survival Status 0/1'] = (data['Overall_Survival_Status'] == 'DECEASED').astype(int)
    kmf.fit(np.array(data['Overall_Survival_Months']),
            event_observed=np.array(data['Overall Survival Status 0/1']),
            label=group)
    # The i-th colour of palette_list goes with the i-th group.
    kmf.plot_survival_function(color=palette_list[i], ax=ax)
plt.show()
# Kaplan-Meier overall-survival curves per tp53_res_group, restricted to the
# high-CCF view of master_ccf (ccf_group 'high' plus every 0_HETLOSS row).
master_high = master_ccf[(master_ccf['ccf_group'] == 'high') | (master_ccf['tp53_group'] == '0_HETLOSS')]
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - High CCF'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()
for i, group in enumerate(res_group_list):
    # Rows of this group that have both survival columns filled in.
    data = master_high[master_high['tp53_res_group'] == group].dropna(
        subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    if data.empty:
        # A group may have no rows in this restricted cohort: skip it explicitly.
        print('No sample with survival data in group {!r} - skipped'.format(group))
        continue
    # Event flag: 1 when the status is 'DECEASED', 0 otherwise.
    data['Overall Survival Status 0/1'] = (data['Overall_Survival_Status'] == 'DECEASED').astype(int)
    try:
        kmf.fit(np.array(data['Overall_Survival_Months']),
                event_observed=np.array(data['Overall Survival Status 0/1']),
                label=group)
        kmf.plot_survival_function(color=res_palette_list[i], ax=ax)
    except Exception as err:
        # Still best-effort (one failing group must not abort the figure), but
        # the failure is reported instead of being silently swallowed, and
        # KeyboardInterrupt / SystemExit are no longer caught.
        print('Could not draw the survival curve of group {!r}: {!r}'.format(group, err))
plt.show()
# Kaplan-Meier overall-survival curves per tp53_group (entries of group_list),
# computed on master_high and drawn on one axes.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111)
fig.suptitle('Survival Analysis - {} - High CCF'.format(cancer), fontsize=16, weight='bold')
kmf = KaplanMeierFitter()
for i, group in enumerate(group_list):
    # Rows of this group that have both survival columns filled in.
    data = master_high[master_high['tp53_group'] == group].dropna(
        subset=['Overall_Survival_Months', 'Overall_Survival_Status'])
    if data.empty:
        # A group may have no rows in this restricted cohort: skip it explicitly.
        print('No sample with survival data in group {!r} - skipped'.format(group))
        continue
    # Event flag: 1 when the status is 'DECEASED', 0 otherwise.
    data['Overall Survival Status 0/1'] = (data['Overall_Survival_Status'] == 'DECEASED').astype(int)
    try:
        kmf.fit(np.array(data['Overall_Survival_Months']),
                event_observed=np.array(data['Overall Survival Status 0/1']),
                label=group)
        kmf.plot_survival_function(color=palette_list[i], ax=ax)
    except Exception as err:
        # Still best-effort (one failing group must not abort the figure), but
        # the failure is reported instead of being silently swallowed, and
        # KeyboardInterrupt / SystemExit are no longer caught.
        print('Could not draw the survival curve of group {!r}: {!r}'.format(group, err))
plt.show()